Let's have a look at some of the most basic numpy concepts that we will use in the more advanced sections
import numpy as np
import datetime as datetime
from matplotlib import dates
import pandas as pd
In order to test our functions, being able to create "random" data is the key
# Create a range from 0 (inclusive) to 10 (exclusive) with step size 2
np.arange(0,10,2)
# Return 5 evenly spaced floats from 0 to 10, including both endpoints
np.linspace(0,10,5)
# Return 2 random floats from a uniform distribution over [0, 1) (all values equally likely)
np.random.rand(2)
# Return 2 random floats from a standard normal distribution (mean = 0, std = 1)
np.random.randn(2)
# Return 2 random floats from a normal distribution with mean = 3 and std = 1
np.random.normal(3,1,2)
# Reproduce the same "random" numbers by setting a seed.
# The same seed always produces the same sequence; which number you pick
# is arbitrary, but different seeds give different sequences.
# A seed only affects the draws made after the call (in a notebook, re-seed in each cell).
np.random.seed(1)
print(np.random.rand(1))
np.random.seed(2)
print(np.random.rand(1))
np.random.seed(1)
print(np.random.rand(1))
# Creating a matrix from an array
# First create an array of the integers 0-3
arr = np.arange(4)
# Then reshape it into 2 rows x 2 columns
matrix = arr.reshape(2,2)
# Return the max and min values of an array/matrix
print(arr.max())
print(matrix.min())
# Remember: if you want to work with an independent copy of an array, use .copy()
arr_copy = arr.copy()
# Otherwise you will always be editing the original array as well,
# since the new object created is a view pointing at the original data
# Create a 3x3 matrix
matrix = np.array([[1,2,3],[4,5,6],[7,8,9]])
print(matrix)
# Return the item at row 1, column 1 (zero-based indexing), i.e. 5
matrix[1][1]
# Return the rows up to row 2 (not included)
matrix[:2]
# Rows up to 2 (excluded), and columns from 1 onwards
matrix[:2,1:]
# Return a filtered array containing only the values lower than 2
print("Original array: ")
print(arr)
print("Result: ")
print(arr[arr<2])
Skipping the basic ones
# Sum of the values of each column (axis=0 collapses the rows)
print(matrix)
matrix.sum(axis=0)
# Sum of the values of each row (axis=1 collapses the columns)
print(matrix)
matrix.sum(axis=1)
import pandas as pd
# Create a 5x4 matrix of uniform random values, used for the dataframe creation
rand_mat = np.random.rand(5,4)
# Create dataframe with row labels A-E and column labels R, P, U, I
df = pd.DataFrame(data=rand_mat, index = 'A B C D E'.split(), columns = "R P U I".split())
# Drop row "A" (returns a new dataframe; df itself is unchanged)
df.drop("A")
# Drop column "R" (axis=1 selects columns)
df.drop("R",axis=1)
# Return the row labelled "A" as a Series (label-based indexing)
df.loc["A"]
# Return row number 2 as a Series (position-based indexing)
df.iloc[2]
# Filtering by value: keep only the rows whose value in column I is greater than 0.3
df[df["I"]>0.3]
# Return an array of the unique values in column R
df["R"].unique()
# Return the number of unique values in column R
df["R"].nunique()
# Apply a function element-wise to a column (here: add 1 to every value)
df["R"].apply(lambda a: a+1)
# Display plots directly in jupyter (IPython magic; only valid inside a notebook)
%matplotlib inline
# Import data; index_col=0 uses the first CSV column as the row index
df1 = pd.read_csv("./original/TSA_COURSE_NOTEBOOKS/03-Pandas-Visualization/df1.csv",index_col=0)
df2 = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/03-Pandas-Visualization/df2.csv')
There are several plot types built into pandas; most of them are statistical by nature:
- df.plot.hist() — histogram
- df.plot.bar() — bar chart
- df.plot.barh() — horizontal bar chart
- df.plot.line() — line chart
- df.plot.area() — area chart
- df.plot.scatter() — scatter plot
- df.plot.box() — box plot
- df.plot.kde() — kde plot
- df.plot.hexbin() — hexagonal bin plot
- df.plot.pie() — pie chart
You can also call specific plots by passing their name as an argument, as with df.plot(kind='area').
Let's start going through them! First we'll look at the data:
This is one of the most commonly used plots. Histograms describe the distribution of continuous data by dividing the data into "bins" of equal width, and plotting the number of values that fall into each bin. [reference]
# Default histogram of column A (10 bins)
df1['A'].plot.hist();
We can add settings to do things like bring the x- and y-axis values to the edge of the graph, and insert lines between vertical bins:
# edgecolor draws lines between the bins; autoscale pulls the data to the plot edges
df1['A'].plot.hist(edgecolor='k').autoscale(enable=True, axis='both', tight=True)
You can use any matplotlib color spec for edgecolor, such as 'b', 'g', 'r', 'c', 'm', 'y', 'k', 'w', or the string representation of a float value for shades of grey, such as '0.5'
For autoscale the axis can be set to 'x', 'y' or 'both'
We can also change the number of bins (the range of values over which frequencies are calculated) from the default value of 10:
# bins=40 gives a finer-grained view than the default of 10
df1['A'].plot.hist(bins=40, edgecolor='k').autoscale(enable=True, axis='both', tight=True)
You can also access a histogram like this:
# Equivalent shortcut for the histogram above
df1['A'].hist();
For more on using df.hist() visit https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.hist.html
Barplots are similar to histograms, except that they deal with discrete data, and often reflect multiple variables. [reference]
# One bar per row, one color per column
df2.plot.bar();
# stacked=True piles the column values on top of each other
df2.plot.bar(stacked=True);
# USE .barh() TO DISPLAY A HORIZONTAL BAR PLOT
df2.plot.barh();
Line plots are used to compare two or more variables. By default the x-axis values are taken from the index. [reference]
Line plots happen to be the default pandas plot. They are accessible through df.plot() as well as df.plot.line()
# Plot a single column; the x-axis comes from the index
df2.plot.line(y='a',figsize=(12,3),lw=2);
# Use lw (line width) to change the thickness of the line
df2.plot.line(y=['a','b','c'],figsize=(12,3),lw=3);
Area plots represent cumulatively stacked line plots where the space between lines is emphasized with colors. [reference]
# Stacked area plot of all columns
df2.plot.area();
It often helps to mute the colors by passing an alpha transparency value between 0 and 1.
# alpha adds transparency to mute the colors
df2.plot.area(alpha=0.4);
To produce a blended area plot, pass a stacked=False argument:
# stacked=False overlays the areas instead of stacking them
df2.plot.area(stacked=False, alpha=0.4);
Scatter plots are a useful tool to quickly compare two variables, and to look for possible trends. [reference]
# Basic scatter plot of column A against column B
df1.plot.scatter(x='A',y='B');
You can use c to color each marker based off another column value. Use cmap to indicate which colormap to use.
For all the available colormaps, check out: http://matplotlib.org/users/colormaps.html
# c colors each marker by the value in column C; cmap picks the colormap
df1.plot.scatter(x='A',y='B',c='C',cmap='coolwarm');
Alternatively you can use s to indicate marker size based off another column. The s parameter needs to be an array, not just the name of a column:
# s sizes each marker by column C (scaled up by 50); negative values trigger a warning
df1.plot.scatter(x='A',y='B',s=df1['C']*50);
The warning above appeared because some df1['C'] values are negative. We can fix this by finding the minimum value, writing a function that adds an offset to each value, and applying our function to the data with .apply(func).
Also, these data points have a lot of overlap. We can address this issue by passing in an alpha blending value between 0 and 1 to make markers more transparent.
def add_three(val, offset=3):
    """Shift *val* upward by *offset* (default 3, preserving the original behavior).

    Used to make the negative values in df1['C'] non-negative so they can
    serve as scatter-plot marker sizes. The offset is parameterized so it
    can be matched to the minimum value of the data being shifted, as the
    surrounding text suggests.

    Parameters
    ----------
    val : numeric (scalar or anything supporting +)
        The value to shift.
    offset : numeric, default 3
        Amount added to ``val``.

    Returns
    -------
    The shifted value ``val + offset``.
    """
    return val + offset
# Shift C to non-negative values for marker size; alpha=0.2 reduces overplotting
df1.plot.scatter(x='A',y='B',s=df1['C'].apply(add_three)*45, alpha=0.2);
Box plots, aka "box and whisker diagrams", describe the distribution of data by dividing it into quartiles about the median.
Look here for a description of boxplots. [reference]
# One box per column
df2.boxplot();
To draw boxplots based on groups, first pass in a list of columns you want plotted (including the groupby column), then pass by='columname' into .boxplot(). Here we'll group records by the 'e' column, and draw boxplots for the 'b' column.
# Group rows by column 'e' and draw one box of 'b' per group
df2[['b','e']].boxplot(by='e', grid=False);
In the next section on Customizing Plots we'll show how to change the title and axis labels.
KDE plots show the underlying distribution of the data, similar to a histogram. These plots are accessible either through df.plot.kde() or df.plot.density() [reference]
# KDE of a single column
df2['a'].plot.kde();
# .density() is an alias for .kde(); here applied to every column
df2.plot.density();
Useful for Bivariate Data, alternative to scatterplot. [reference]
# FIRST CREATE A DATAFRAME OF RANDOM VALUES (1000 rows, standard normal)
df = pd.DataFrame(np.random.randn(1000, 2), columns=['a', 'b'])
# MAKE A HEXAGONAL BIN PLOT; gridsize controls the number of hexagons
df.plot.hexbin(x='a',y='b',gridsize=25,cmap='Oranges');
In this section we'll show how to control the position and appearance of axis labels and legends.
For more info on the following topics visit https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.plot.html
The pandas .plot() method takes optional arguments that allow you to control linestyles, colors, widths and more.
# START WITH A SIMPLE LINE PLOT
df2['c'].plot(figsize=(8,3));
# ls = linestyle (dash-dot), c = color (red), lw = line width
df2['c'].plot.line(ls='-.', c='r', lw='4', figsize=(8,3));
For more on linestyles, click here.
# START WITH A SIMPLE MULTILINE PLOT (one line per column)
df2.plot(figsize=(8,3));
When we call df.plot(), pandas returns a matplotlib.axes.AxesSubplot object. We can set labels on that object so long as we do it in the same jupyter cell. Setting an autoscale is done the same way.
title='Custom Pandas Plot'
ylabel='Y-axis data'
xlabel='X-axis data'
# df.plot() returns a matplotlib Axes object we can keep customizing,
# as long as we do it in the same notebook cell
ax = df2.plot(figsize=(8,3),title=title)
ax.set(xlabel=xlabel, ylabel=ylabel)
ax.autoscale(axis='x',tight=True);
Pandas tries to optimize placement of the legend to reduce overlap on the plot itself. However, we can control the placement ourselves, and even place the legend outside of the plot. We do this through the .legend() method. [reference]
For starters we can pass a location code. .legend(loc=1) places the legend in the upper-right corner of the plot.
Alternatively we can pass a location string: .legend(loc='upper right') does the same thing.
| LOCATION CODE | LOCATION STRING |
|---|---|
| 0 | 'best' |
| 1 | 'upper right' |
| 2 | 'upper left' |
| 3 | 'lower left' |
| 4 | 'lower right' |
| 5 | 'right' |
| 6 | 'center left' |
| 7 | 'center right' |
| 8 | 'lower center' |
| 9 | 'upper center' |
| 10 | 'center' |
ax = df2.plot(figsize=(8,3))
ax.autoscale(axis='x',tight=True)
# loc=1 places the legend in the upper-right corner
ax.legend(loc=1);
We can pass a second argument, bbox_to_anchor that treats the value passed in through loc as an anchor point, and positions the legend along the x and y axes based on a two-value tuple.
# FIRST, PLACE THE LEGEND IN THE LOWER-LEFT (loc=3)
ax = df2.plot(figsize=(8,3))
ax.autoscale(axis='x',tight=True)
ax.legend(loc=3);
# NEXT, MOVE THE LEGEND A LITTLE TO THE RIGHT AND UP:
# bbox_to_anchor offsets the loc anchor point in axes coordinates (x, y)
ax = df2.plot(figsize=(8,3))
ax.autoscale(axis='x',tight=True)
ax.legend(loc=3, bbox_to_anchor=(0.1,0.1));
In the above plot we passed (0.1,0.1) as our two-item tuple. This places the legend slightly to the right and slightly upward.
To place the legend outside the plot on the right-hand side, pass a value greater than or equal to 1 as the first item in the tuple.
ax = df2.plot(figsize=(8,3))
ax.autoscale(axis='x',tight=True)
# An x-anchor >= 1.0 moves the legend outside the plot, to the right
ax.legend(loc=3, bbox_to_anchor=(1.0,0.1));
We'll usually deal with time series as a datetime index when working with pandas dataframes. Fortunately pandas has a lot of functions and methods to work with time series!
For more on the pandas DatetimeIndex visit https://pandas.pydata.org/pandas-docs/stable/timeseries.html
Ways to build a DatetimeIndex:
# THE WEEK OF JULY 8TH, 2018: 7 consecutive calendar days ('D') starting 2018-07-08
idx = pd.date_range('7/8/2018', periods=7, freq='D')
idx
# to_datetime parses many different string formats; None becomes NaT (Not a Time)
idx = pd.to_datetime(['Jan 01, 2018','1/2/18','03-Jan-2018',None])
idx
# Create a NumPy datetime array with day precision ('[D]')
some_dates = np.array(['2016-03-15', '2017-05-24', '2018-08-09'], dtype='datetime64[D]')
some_dates
# An explicit format string removes ambiguity: here day/month/year, so '2/1/2018' is Jan 2nd
pd.to_datetime(['2/1/2018','3/1/2018'],format='%d/%m/%Y')
Note: the above code is a faster way of doing the following:
# Manual approach: read the CSV, convert the Date column, then set it as index
df = pd.read_csv('../Data/starbucks.csv')
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date',inplace=True)
# index_col indicates that the index will be the column called 'Date'
# parse_dates transforms the date strings into datetime format while reading
df = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/starbucks.csv', index_col='Date', parse_dates=True)
# Our index is now a DatetimeIndex
df.index
When calling .resample() you first need to pass in a rule parameter, then you need to call some sort of aggregation function.
The rule parameter describes the frequency with which to apply the aggregation function (daily, monthly, yearly, etc.)
It is passed in using an "offset alias" - refer to the table below. [reference]
The aggregation function is needed because, due to resampling, we need some sort of mathematical rule to join the rows (mean, sum, count, etc.)
| ALIAS | DESCRIPTION |
|---|---|
| B | business day frequency |
| C | custom business day frequency (experimental) |
| D | calendar day frequency |
| W | weekly frequency |
| M | month end frequency |
| SM | semi-month end frequency (15th and end of month) |
| BM | business month end frequency |
| CBM | custom business month end frequency |
| MS | month start frequency |
| SMS | semi-month start frequency (1st and 15th) |
| BMS | business month start frequency |
| CBMS | custom business month start frequency |
| Q | quarter end frequency |
| intentionally left blank |
| ALIAS | DESCRIPTION |
|---|---|
| BQ | business quarter end frequency |
| QS | quarter start frequency |
| BQS | business quarter start frequency |
| A | year end frequency |
| BA | business year end frequency |
| AS | year start frequency |
| BAS | business year start frequency |
| BH | business hour frequency |
| H | hourly frequency |
| T, min | minutely frequency |
| S | secondly frequency |
| L, ms | milliseconds |
| U, us | microseconds |
| N | nanoseconds |
Let's resample our dataframe using rule "A", which is the year-end frequency, and aggregate it with the mean
# Yearly means: rule 'A' groups the rows by year end, then .mean() aggregates each year
df.resample(rule='A').mean()
Resampling rule 'A' takes all of the data points in a given year, applies the aggregation function (in this case we calculate the mean), and reports the result as the last day of that year.
title = 'Monthly Max Closing Price for Starbucks'
# 'M' groups by month end; .max() keeps the highest close of each month
df['Close'].resample('M').max().plot.bar(figsize=(16,6), title=title,color='#1f77b4');
Sometimes you may need to shift all your data up or down along the time series index. In fact, a lot of pandas built-in methods do this under the hood. This isn't something we'll do often in the course, but it's definitely good to know about this anyways!
# Reload the dataset with a fresh DatetimeIndex
df = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/starbucks.csv',index_col='Date',parse_dates=True)
This method shifts the entire date index a given number of rows, without regard for time periods (months & years).
It returns a modified copy of the original DataFrame.
In other words, it moves all the rows down or up.
# shift(1) moves every row down one position; the first row becomes NaN
df.shift(1).head()
# NOTE: You will lose the last piece of data, which no longer has an index to occupy!
df.shift(1).tail()
We can choose to shift index values up or down without realigning the data by passing in a freq argument.
This method shifts dates to the next period based on a frequency code. Common codes are 'M' for month-end and 'A' for year-end.
Refer to the Time Series Offset Aliases table from the Time Resampling lecture for a full list of values, or click here.
# Shift the index values themselves to the end of the month ('M'),
# without realigning the data
df.shift(periods=1, freq='M').head()
For more info on time shifting visit http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.shift.html
A common process with time series is to create data based off of a rolling mean. The idea is to divide the data into "windows" of time, and then calculate an aggregate function for each window. In this way we obtain a simple moving average. Let's show how to do this easily with pandas!
# Reload the data and plot the raw closing price
df = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/starbucks.csv', index_col='Date', parse_dates=True)
df['Close'].plot(figsize=(12,5)).autoscale(axis='x',tight=True);
Now let's add in a rolling mean! The rolling method produces a new series in which every entry is the aggregate (here, the mean) of the window of rows ending at that entry.
# 7 day rolling mean; the first 6 rows are NaN because the window is incomplete
df.rolling(window=7).mean().head(15)
df['Close'].plot(figsize=(12,5)).autoscale(axis='x',tight=True)
# Overlay the 30-day rolling mean on the raw series
df.rolling(window=30).mean()['Close'].plot();
Instead of calculating values for a rolling window of dates, what if you wanted to take into account everything from the start of the time series up to each point in time? For example, instead of considering the average over the last 7 days, we would consider all prior data in our expanding set of averages.
# Optional: min_periods=30 waits for 30 observations before producing a value
df['Close'].expanding(min_periods=30).mean().plot(figsize=(12,5));
df = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/starbucks.csv',index_col='Date',parse_dates=True)
First we'll create a line plot that puts both 'Close' and 'Volume' on the same graph.
Remember that we can use df.plot() in place of df.plot.line()
# Ensure the index is datetime before plotting
df.index = pd.to_datetime(df.index)
df.plot();
title='Starbucks Closing Stock Prices'
ylabel='Closing Price (USD)'
xlabel='Closing Date'
# Plot the closing price and label the axes on the returned Axes object
ax = df['Close'].plot(figsize=(12,6),title=title)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel);
Thanks to the date index, we can make a selection like the following:
# Slice the series between two dates (both endpoints included for a DatetimeIndex)
df['Close']['2017-01-01':'2017-03-01']
There are two ways we can set a specific span of time as an x-axis limit. We can plot a slice of the dataset, or we can pass x-limit values as an argument into df.plot().
The advantage of using a slice is that pandas automatically adjusts the y-limits accordingly.
The advantage of passing in arguments is that pandas automatically tightens the x-axis. Plus, if we're also setting y-limits this can improve readability.
# Slicing approach — dates are separated by a colon:
df['Close']['2017-01-01':'2017-03-01'].plot(figsize=(12,4)).autoscale(axis='x',tight=True);
# xlim approach — dates are separated by a comma:
# display the plot only from the 1st of January until the 1st of March
df['Close'].plot(figsize=(12,4),xlim=['2017-01-01','2017-03-01']);
Now let's focus on the y-axis limits to get a better sense of the shape of the data.
First we'll find out what upper and lower limits to use.
# FIND THE MINIMUM VALUE IN THE RANGE (to choose a sensible lower y-limit):
df.loc['2017-01-01':'2017-03-01']['Close'].min()
Let's add a title and axis labels to our subplot.
title='Starbucks Closing Stock Prices'
ylabel='Closing Price (USD)'
xlabel='Closing Date'
# ylim=[51,57] zooms the y-axis to the observed range for the period
ax = df['Close'].plot(xlim=['2017-01-04','2017-03-01'],ylim=[51,57],figsize=(12,4),title=title)
ax.set(xlabel=xlabel, ylabel=ylabel);
We can pass arguments into .plot() to change the linestyle and color. Refer to the Customizing Plots lecture from the previous section for more options.
# ls='--' dashed line, c='r' red
df['Close'].plot(xlim=['2017-01-01','2017-03-01'],ylim=[51,57],ls='--',c='r');
In this section we'll look at how to change the format and appearance of dates along the x-axis. To do this, we'll borrow a tool from matplotlib called dates.
The x-axis values can be divided into major and minor axes. For now, we'll work only with the major axis and learn how to set the spacing with .set_major_locator(). As you can see in the graph below, the X axis is not beautifully distributed
# CREATE OUR AXIS OBJECT (default date ticks — unevenly distributed)
ax = df['Close'].plot(xlim=['2017-01-01','2017-03-01'],ylim=[51,57])
With set_major_locator we can solve this problem
# CREATE OUR AXIS OBJECT
ax = df['Close'].plot(xlim=['2017-01-01','2017-03-01'],ylim=[51,57])
# REMOVE PANDAS DEFAULT "Date" LABEL
ax.set(xlabel='')
# SET THE TICK LOCATOR FOR THE MAJOR AXIS:
# place one major tick per week, on Mondays (byweekday=0 means Monday)
ax.xaxis.set_major_locator(dates.WeekdayLocator(byweekday=0))
Notice that dates are spaced one week apart. The dates themselves correspond with byweekday=0, or Mondays.
For a full list of locator options available from matplotlib.dates visit https://matplotlib.org/api/dates_api.html#date-tickers
Formatting follows the Python datetime strftime codes.
The following examples are based on datetime.datetime(2001, 2, 3, 16, 5, 6):
| CODE | MEANING | EXAMPLE |
|---|---|---|
| %Y | Year with century as a decimal number. | 2001 |
| %y | Year without century as a zero-padded decimal number. | 01 |
| %m | Month as a zero-padded decimal number. | 02 |
| %B | Month as locale’s full name. | February |
| %b | Month as locale’s abbreviated name. | Feb |
| %d | Day of the month as a zero-padded decimal number. | 03 |
| %A | Weekday as locale’s full name. | Saturday |
| %a | Weekday as locale’s abbreviated name. | Sat |
| %H | Hour (24-hour clock) as a zero-padded decimal number. | 16 |
| %I | Hour (12-hour clock) as a zero-padded decimal number. | 04 |
| %p | Locale’s equivalent of either AM or PM. | PM |
| %M | Minute as a zero-padded decimal number. | 05 |
| %S | Second as a zero-padded decimal number. | 06 |
| CODE | MEANING | EXAMPLE |
|---|---|---|
| %#m | Month as a decimal number. (Windows) | 2 |
| %-m | Month as a decimal number. (Mac/Linux) | 2 |
| %#x | Long date | Saturday, February 03, 2001 |
| %#c | Long date and time | Saturday, February 03, 2001 16:05:06 |
# USE THIS SPACE TO EXPERIMENT WITH DIFFERENT FORMATS
from datetime import datetime
# e.g. "Saturday, February 03, 2001 04:05:06 PM"
datetime(2001, 2, 3, 16, 5, 6).strftime("%A, %B %d, %Y %I:%M:%S %p")
We use the set_major_formatter to format the display of the date in the plot
# Weekly (Monday) major ticks, formatted as abbreviated-weekday-month-day
ax = df['Close'].plot(xlim=['2017-01-01','2017-03-01'],ylim=[51,57],title='2017 Starbucks Closing Stock Prices')
ax.set(xlabel='')
ax.xaxis.set_major_locator(dates.WeekdayLocator(byweekday=0))
ax.xaxis.set_major_formatter(dates.DateFormatter("%a-%B-%d"))
All of the tick marks we've used so far have belonged to the major axis. We can assign another level called the minor axis, perhaps to separate month names from days of the month.
# rot=0 keeps the tick labels horizontal
ax = df['Close'].plot(xlim=['2017-01-01','2017-03-01'],ylim=[51,57],rot=0,title='2017 Starbucks Closing Stock Prices')
ax.set(xlabel='')
# Major axis: day of month on every Monday
ax.xaxis.set_major_locator(dates.WeekdayLocator(byweekday=0))
ax.xaxis.set_major_formatter(dates.DateFormatter('%d'))
# Minor axis: abbreviated month name, pushed below the day labels with newlines
ax.xaxis.set_minor_locator(dates.MonthLocator())
ax.xaxis.set_minor_formatter(dates.DateFormatter('\n\n%b'))
We can add x and y axis gridlines that extend into the plot from each major tick mark.
ax = df['Close'].plot(xlim=['2017-01-01','2017-03-01'],ylim=[51,57],rot=0,title='2017 Starbucks Closing Stock Prices')
ax.set(xlabel='')
ax.xaxis.set_major_locator(dates.WeekdayLocator(byweekday=0))
ax.xaxis.set_major_formatter(dates.DateFormatter('%d'))
ax.xaxis.set_minor_locator(dates.MonthLocator())
ax.xaxis.set_minor_formatter(dates.DateFormatter('\n\n%b'))
# Extend gridlines into the plot from each major tick, on both axes
ax.yaxis.grid(True)
ax.xaxis.grid(True)
Let's walk through a very simple example of using statsmodels!
For these exercises we'll be using a statsmodels built-in macroeconomics dataset:
US Macroeconomic Data for 1959Q1 - 2009Q3
Number of Observations - 203
Number of Variables - 14
Variable name definitions:
year - 1959q1 - 2009q3
quarter - 1-4
realgdp - Real gross domestic product (Bil. of chained 2005 US$,
seasonally adjusted annual rate)
realcons - Real personal consumption expenditures (Bil. of chained
2005 US$, seasonally adjusted annual rate)
realinv - Real gross private domestic investment (Bil. of chained
2005 US$, seasonally adjusted annual rate)
realgovt - Real federal consumption expenditures & gross investment
(Bil. of chained 2005 US$, seasonally adjusted annual rate)
realdpi - Real private disposable income (Bil. of chained 2005
US$, seasonally adjusted annual rate)
cpi - End of the quarter consumer price index for all urban
consumers: all items (1982-84 = 100, seasonally adjusted).
m1 - End of the quarter M1 nominal money stock (Seasonally
adjusted)
tbilrate - Quarterly monthly average of the monthly 3-month
treasury bill: secondary market rate
unemp - Seasonally adjusted unemployment rate (%)
pop - End of the quarter total population: all ages incl. armed
forces over seas
infl - Inflation rate (ln(cpi_{t}/cpi_{t-1}) * 400)
realint - Real interest rate (tbilrate - infl)
from statsmodels.tsa.filters.hp_filter import hpfilter
# Load the US macroeconomic dataset described above (quarterly, 1959Q1-2009Q3)
df = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/macrodata.csv',index_col=0,parse_dates=True)
df.head()
# Plot real GDP with a tight x-axis
ax = df['realgdp'].plot()
ax.autoscale(axis='x',tight=True)
ax.set(ylabel='REAL GDP');
The Hodrick-Prescott filter separates a time-series $y_t$ into a trend component $\tau_t$ and a cyclical component $c_t$
$y_t = \tau_t + c_t$
The components are determined by minimizing the following quadratic loss function, where $\lambda$ is a smoothing parameter:
$\min_{\\{ \tau_{t}\\} }\sum_{t=1}^{T}c_{t}^{2}+\lambda\sum_{t=1}^{T}\left[\left(\tau_{t}-\tau_{t-1}\right)-\left(\tau_{t-1}-\tau_{t-2}\right)\right]^{2}$
The $\lambda$ value above handles variations in the growth rate of the trend component.
When analyzing quarterly data, the default lambda value of 1600 is recommended. Use 6.25 for annual data, and 129,600 for monthly data.
The following function extracts the cycle and trend component and returns it
# Tuple unpacking: hpfilter returns (cycle, trend);
# lamb=1600 is the recommended smoothing value for quarterly data
gdp_cycle, gdp_trend = hpfilter(df['realgdp'], lamb=1600)
# Both components are pandas Series
Let's plot the results
# Plot each component separately
gdp_cycle.plot();
gdp_trend.plot();
# Store the trend alongside the original series to compare them in one figure
df['trend'] = gdp_trend
df[['trend','realgdp']].plot(figsize=(12,5)).autoscale(axis='x',tight=True);
As we begin working with endogenous data ("endog" for short) and start to develop forecasting models, it helps to identify and isolate factors working within the system that influence behavior. Here the name "endogenous" considers internal factors, while "exogenous" would relate to external forces. These fall under the category of state space models, and include decomposition (described below), and exponential smoothing (described in an upcoming section).
The decomposition of a time series attempts to isolate individual components such as error, trend, and seasonality (ETS). We've already seen a simplistic example of this in the Introduction to Statsmodels section with the Hodrick-Prescott filter. There we separated data into a trendline and a cyclical feature that mapped observed data back to the trend.
Statsmodels provides a seasonal decomposition tool we can use to separate out the different components. This lets us see quickly and visually what each component contributes to the overall behavior.
We apply an additive model when it seems that the trend is more linear and the seasonality and trend components seem to be constant over time (e.g. every year we add 10,000 passengers).
A multiplicative model is more appropriate when we are increasing (or decreasing) at a non-linear rate (e.g. each year we double the amount of passengers).
For these examples we'll use the International Airline Passengers dataset, which gives monthly totals in thousands from January 1949 to December 1960.
from statsmodels.tsa.seasonal import seasonal_decompose
airline = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/airline_passengers.csv',index_col='Month',parse_dates=True)
# Drop any rows with missing values before decomposing
airline.dropna(inplace=True)
airline.plot();
The graph above clearly shows that a multiplicative model fits better: the number of passengers grows over time, and the seasonal swings grow along with the trend rather than staying constant.
Using the seasonal_decompose from statsmodels we will decompose the time series into its components:
# model='multiplicative' because the seasonal swings grow with the trend
result = seasonal_decompose(airline['Thousands of Passengers'], model='multiplicative') # model='mul' also works
result.plot();
# Press Tab after "result." to check the different components (trend, seasonal, resid)
result.seasonal.plot();
We've already shown how to create a simple moving average by applying a mean function to a rolling window.
For a quick review
# 6- and 12-month simple moving averages for comparison with the raw series
airline['6-month-SMA'] = airline['Thousands of Passengers'].rolling(window=6).mean()
airline['12-month-SMA'] = airline['Thousands of Passengers'].rolling(window=12).mean()
airline.plot();
We just showed how to calculate the SMA based on some window. However, basic SMA has some weaknesses:
To help fix some of these issues, we can use an EWMA (Exponentially weighted moving average).
EWMA will allow us to reduce the lag effect from SMA and it will put more weight on values that occurred more recently (by applying more weight to the more recent values, thus the name). The amount of weight applied to the most recent values will depend on the actual parameters used in the EWMA and the number of periods given a window size. Full details on the mathematics behind this can be found here. Here is the shorter version of the explanation behind EWMA.
The formula for EWMA is:
Where $x_t$ is the input value, $w_i$ is the applied weight (Note how it can change from $i=0$ to $t$), and $y_t$ is the output.
Now the question is, how do we define the weight term $w_i$?
This depends on the adjust parameter you provide to the .ewm() method.
When adjust=True (default) is used, weighted averages are calculated using weights equal to $w_i = (1 - \alpha)^i$ This is called Simple Exponential Smoothing which gives:
When adjust=False is specified, moving averages are calculated as:
$\begin{split}y_0 &= x_0 \\ y_t &= (1 - \alpha) y_{t-1} + \alpha x_t\end{split}$
which is equivalent to using weights:
\begin{split}w_i = \begin{cases} \alpha (1 - \alpha)^i & \text{if } i < t \\ (1 - \alpha)^i & \text{if } i = t. \end{cases}\end{split}
When adjust=False we have $y_0=x_0$ and from the last representation above we have $y_t=\alpha x_t+(1-\alpha)y_{t-1}$, therefore there is an assumption that $x_0$ is not an ordinary value but rather an exponentially weighted moment of the infinite series up to that point.
The use of adjust=True or adjust=False, has to do whether a series is finite or infinite. In fact if you take an infinite series and pass it in the adjust=True formula, you will end up having the adjust=False formula.
For the smoothing factor $\alpha$ one must have $0<\alpha≤1$, and while it is possible to pass alpha directly, it’s often easier to think about either the span, center of mass (com) or half-life of an EW moment:
\begin{split}\alpha = \begin{cases} \frac{2}{s + 1}, & \text{for span}\ s \geq 1\\ \frac{1}{1 + c}, & \text{for center of mass}\ c \geq 0\\ 1 - \exp^{\frac{\log 0.5}{h}}, & \text{for half-life}\ h > 0 \end{cases}\end{split}Those are the ways alpha can be calculated, but you can also just input it manually
We have to pass precisely one of the above into the .ewm() function. For our data we'll use span=12.
Disadvantages of EWMA:
- Does not take into account trend and seasonality.
# With span = 12, we are using an alpha = 2/(12+1). Look at the formula above.
# Therefore, passing the span parameter implicitly sets alpha.
airline['EWMA12'] = airline['Thousands of Passengers'].ewm(span=12,adjust=False).mean()
airline[['Thousands of Passengers','EWMA12']].plot();
# (repeated cell — same computation and plot as above)
airline['EWMA12'] = airline['Thousands of Passengers'].ewm(span=12,adjust=False).mean()
airline[['Thousands of Passengers','EWMA12']].plot();
# Compare the raw series, the EWMA and the 12-month SMA in one figure
airline[['Thousands of Passengers','EWMA12','12-month-SMA']].plot(figsize=(12,8)).autoscale(axis='x',tight=True);
In the previous section on Exponentially Weighted Moving Averages (EWMA) we applied Simple Exponential Smoothing using just one smoothing factor $\alpha$ (alpha). This failed to account for other contributing factors like trend and seasonality.
In this section we'll look at Double and Triple Exponential Smoothing with the Holt-Winters Methods.
In Double Exponential Smoothing (double is because we have 2 parameters) (aka Holt's Method) we introduce a new smoothing factor $\beta$ (beta) that addresses trend :
\begin{split}l_t &= (1 - \alpha) l_{t-1} + \alpha x_t, & \text{ level}\\ b_t &= (1-\beta)b_{t-1} + \beta(l_t-l_{t-1}) & \text{ trend}\\ y_t &= l_t + b_t & \text{ fitted model}\\ \hat y_{t+h} &= l_t + hb_t & \text{ forecasting model (} h = \text{# periods into the future)}\end{split}Because we haven't yet considered seasonal fluctuations, the forecasting model is simply a straight sloped line extending from the most recent data point. We'll see an example of this in upcoming lectures.
With Triple Exponential Smoothing (aka the Holt-Winters Method) we introduce a smoothing factor $\gamma$ (gamma) that addresses seasonality:
\begin{split}l_t &= (1 - \alpha) l_{t-1} + \alpha x_t, & \text{ level}\\ b_t &= (1-\beta)b_{t-1} + \beta(l_t-l_{t-1}) & \text{ trend}\\ c_t &= (1-\gamma)c_{t-L} + \gamma(x_t-l_{t-1}-b_{t-1}) & \text{ seasonal}\\ y_t &= (l_t + b_t) c_t & \text{ fitted model}\\ \hat y_{t+m} &= (l_t + mb_t)c_{t-L+1+(m-1)modL} & \text{ forecasting model (} m = \text{# periods into the future)}\end{split}Here $L$ represents the number of divisions per cycle. In our case looking at monthly data that displays a repeating pattern each year, we would use $L=12$.
In general, higher values for $\alpha$, $\beta$ and $\gamma$ (values closer to 1), place more emphasis on recent data.
We worked on this above; just as a reminder: a variation of the statsmodels Holt-Winters function provides Simple Exponential Smoothing. We'll show that it performs the same calculation of the weighted moving average as the pandas .ewm() method:
$\begin{split}y_0 &= x_0 \\
y_t &= (1 - \alpha) y_{t-1} + \alpha x_t,\end{split}$
#We will keep using the airline passengers dataframe
df = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/airline_passengers.csv',index_col='Month',parse_dates=True)
# Drop missing rows so the smoothing models receive a clean series
df.dropna(inplace=True)
Note that our DatetimeIndex does not have a frequency. In order to build a Holt-Winters smoothing model, statsmodels needs to know the frequency of the data (whether it's daily, monthly etc.). Since observations occur at the start of each month, we'll use MS.
A full list of time series offset aliases can be found here.
# Inspect the index: note it has no frequency set yet
df.index
# Statsmodels needs an explicit frequency; 'MS' = Month Start
df.index.freq = 'MS'
df.index
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
# alpha = 2/(span+1) matches pandas' span-based EWMA parameterization
span = 12
alpha = 2/(span+1)
#as we did in the previous cells
df['EWMA12'] = df['Thousands of Passengers'].ewm(alpha=alpha,adjust=False).mean()
Now we are going to implement the SIMPLE EXP SMOOTHING model, but first let's understand what is going on with the code:
# Build the Simple Exponential Smoothing model on the passengers series
model = SimpleExpSmoothing(df['Thousands of Passengers'])
# With Tab you can check the various attributes of the object "model", like "fit"
# As parameters in the fit method we have the smoothing_level which is alpha
fitted_model = model.fit(smoothing_level=alpha,optimized=False)
# This returns a HoltWintersResultsWrapper
fitted_model.fittedvalues
# The fitted values lag the observations by one period.
# We will simply solve the problem with the shift function
fitted_model.fittedvalues.shift(-1)
# we add the predictions to our dataframe
df['SES12'] = fitted_model.fittedvalues.shift(-1)
df.head()
Note that we could have simply put all the code in the cells into one line of code:
# One-liner equivalent of the cells above:
#df['SES12']=SimpleExpSmoothing(df['Thousands of Passengers']).fit(smoothing_level=alpha,optimized=False).fittedvalues.shift(-1)
df
# EWMA12 and SES12 are the same in this particular case
df.plot(figsize=(14,6))
Now that we have the Simple Exponential Smoothing, let's try the Double Exponential Smoothing, before we plot the results.
Where Simple Exponential Smoothing employs just one smoothing factor $\alpha$ (alpha), Double Exponential Smoothing adds a second smoothing factor $\beta$ (beta) that addresses trends in the data. Like the alpha factor, values for the beta factor fall between zero and one ($0<\beta≤1$). The benefit here is that the model can anticipate future increases or decreases where the level model would only work from recent calculations.
We can also address different types of change (growth/decay) in the trend. If a time series displays a straight-line sloped trend, you would use an additive adjustment. If the time series displays an exponential (curved) trend, you would use a multiplicative adjustment.
As we move toward forecasting, it's worth noting that both additive and multiplicative adjustments may become exaggerated over time, and require damping that reduces the size of the trend over future periods until it reaches a flat line.
from statsmodels.tsa.holtwinters import ExponentialSmoothing
#It's hard to say whether the passengers data has a multiplicative or additive trend, we will assume it is additive
# Double Exponential Smoothing: trend='add' adds the beta (trend) component
df['DESadd12'] = ExponentialSmoothing(df['Thousands of Passengers'], trend='add').fit().fittedvalues.shift(-1)
df.head()
The DESadd12 is almost the same as the number of passengers (the blue line is just behind the red)
df.plot(figsize=(12,5))
# Zoom into the first two years (24 monthly observations)
df.iloc[:24].plot(figsize=(12,5))
Double Exponential Smoothing is clearly better than Simple.
What about Triple?
Triple Exponential Smoothing, the method most closely associated with Holt-Winters, adds support for both trends and seasonality in the data.
# Triple Exponential Smoothing with additive trend and seasonality (12-month season)
df['TESadd12'] = ExponentialSmoothing(df['Thousands of Passengers'],trend='add',seasonal='add',seasonal_periods=12).fit().fittedvalues
df.head()
# Same model but with multiplicative trend and seasonality
df['TESmul12'] = ExponentialSmoothing(df['Thousands of Passengers'],trend='mul',seasonal='mul',seasonal_periods=12).fit().fittedvalues
df.head()
df[['Thousands of Passengers','TESadd12','TESmul12']].plot(figsize=(12,6)).autoscale(axis='x',tight=True);
# Let's check the first 2 years (24 months)
df[['Thousands of Passengers','TESadd12','TESmul12']].iloc[:24].plot(figsize=(12,6)).autoscale(axis='x',tight=True);
In the previous section we fit various smoothing models to existing data. The purpose was not to forecast future values, but to describe (fit) the data we already had.
The purpose now is to predict what happens next.
What's our best guess for next month's value? For the next six months?
In this section we'll look to extend our models into the future. First we'll divide known data into training and testing sets, and evaluate the performance of a trained model on known test data.
The Forecasting Procedure looks like:
For this example we'll use the same airline_passengers dataset, and we'll split the data into 108 training records and 36 testing records. Then we'll evaluate the performance of the model.
# Reload the raw airline passengers data and set a monthly frequency
df = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/airline_passengers.csv',index_col='Month',parse_dates=True)
df.index.freq = 'MS'
df.head()
# Split: 108 training records (rows 0-107) and 36 testing records (rows 108-143).
# (Using df.iloc[:109] here would make train and test overlap at row 108 and
# give 109 training records instead of the stated 108.)
train_data = df.iloc[:108]
test_data = df.iloc[108:]
from statsmodels.tsa.holtwinters import ExponentialSmoothing
# Fit Holt-Winters (multiplicative trend and seasonality) on the TRAINING data only
fitted_model = ExponentialSmoothing(train_data['Thousands of Passengers'],trend='mul',seasonal='mul',seasonal_periods=12).fit()
train_data['Thousands of Passengers'].plot(legend=True,label='TRAIN')
test_data['Thousands of Passengers'].plot(legend=True,label='TEST',figsize=(12,8));
# Forecast 36 months ahead (the length of the test set)
test_predictions = fitted_model.forecast(36).rename('HW Forecast')
test_predictions.plot(legend=True, label="Prediction")
# Plot train, test and forecast together for visual comparison
train_data['Thousands of Passengers'].plot(legend=True,label='TRAIN')
test_data['Thousands of Passengers'].plot(legend=True,label='TEST',figsize=(12,8));
test_predictions.plot(legend=True, label="Prediction")
The prediction (green line) seems to be quite accurate compared to the test data. Remember, the test data is real.
Recall and accuracy are not appropriate evaluation metrics for forecasting techniques. We need metrics designed for continuous values.
Let's analyse how we can use the most basic regression evaluation metrics:
* Mean Absolute Error
* Mean Squared Error
* Root Mean Square Error
from sklearn.metrics import mean_squared_error,mean_absolute_error
being y the true value and y hat the predicted value
Disadvantages: It does not take into account if a few predicted points are actually very far away from the real points. Example: imagine spending for a marketing campaign is very high in December (Christmas) and similar in the rest of the months; then the MAE might not show us the effect of that month, since it is a mean.
Therefore we have the MSE as a solution
mean_absolute_error(test_data, test_predictions)
To analyse the MAE, you can check the summary key figures of the test data
test_data.describe()
Since it is squared, those predicted points that are very far away from the real ones will have more importance in the calculation. You punish the model by having large errors.
Of course this means we cannot interpret the result directly (since it is in squared units). So we have the RMSE for this.
Taking the square root, we get the units back in their original form (just as the standard deviation relates to the variance). The interpretation then depends on the data: an RMSE of 20€ on the price of a house is very good, but for candy it's not.
np.sqrt(mean_squared_error(test_data,test_predictions))
Our root mean squared error is less than the standard deviation of the test data. Therefore, the method is not doing a bad job.
We will use the whole dataset now to predict the future values
# Retrain the model on the FULL dataset to forecast truly future values.
# (Fitting on train_data here would contradict the stated intent of using
# the whole dataset for the final forecast.)
fitted_model = ExponentialSmoothing(df['Thousands of Passengers'],trend='mul',seasonal='mul',seasonal_periods=12).fit()
# Forecast 36 months beyond the end of the observed data
forecast_predictions = fitted_model.forecast(36)
df['Thousands of Passengers'].plot(figsize=(12,8))
forecast_predictions.plot();
In orange we have the predictions. Pretty accurate.
Time series data is said to be stationary if it does not exhibit trends or seasonality. That is, fluctuations in the data are entirely due to outside forces and noise. The file samples.csv contains made-up datasets that illustrate stationary and non-stationary data.
# Load made-up sample series: 'a' is stationary, 'b' is non-stationary
df2 = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/samples.csv',index_col=0,parse_dates=True)
df2.head()
df2['a'].plot(ylim=[0,100],title="STATIONARY DATA").autoscale(axis='x',tight=True);
df2['b'].plot(ylim=[0,100],title="NON-STATIONARY DATA").autoscale(axis='x',tight=True);
Non-stationary data can be made to look stationary through differencing. A simple differencing method calculates the difference between consecutive points.
You can calculate manually the first order difference by substracting:
df2["b"]-df2["b"].shift(1)
But statsmodels has a function that does it for you
from statsmodels.tsa.statespace.tools import diff
# First-order difference: equivalent to df2['b'] - df2['b'].shift(1)
df2['d1'] = diff(df2['b'],k_diff=1)
df2['d1'].plot(title="FIRST DIFFERENCE DATA").autoscale(axis='x',tight=True);
We'll investigate a variety of different forecasting models in upcoming sections, but they all stem from ARIMA.
ARIMA, or Autoregressive Integrated Moving Average is actually a combination of 3 models:
Moving Averages we've already seen with EWMA and the Holt-Winters Method.
Integration will apply differencing to make a time series stationary, which ARIMA requires.
Autoregression is explained in detail in the next section. Here we're going to correlate a current time series with a lagged version of the same series.
Once we understand the components, we'll investigate how to best choose the $p$, $d$ and $q$ values required by the model.
For times series the covariance (autocovariance) will be: ${\displaystyle \gamma_k = \frac 1 n \sum\limits_{t=1}^{n-k} (y_t - \bar{y})(y_{t+k}-\bar{y})}$
Say we have a time series with five observations: {13, 5, 11, 12, 9}.
We can quickly see that $n = 5$, the mean $\bar{y} = 10$, and we'll see that the variance $\sigma^2 = 8$.
The following calculations give us our covariance values:
$\gamma_0 = \frac {(13-10)(13-10)+(5-10)(5-10)+(11-10)(11-10)+(12-10)(12-10)+(9-10)(9-10)} 5 = \frac {40} 5 = 8.0 \\
\gamma_1 = \frac {(13-10)(5-10)+(5-10)(11-10)+(11-10)(12-10)+(12-10)(9-10)} 5 = \frac {-20} 5 = -4.0 \\
\gamma_2 = \frac {(13-10)(11-10)+(5-10)(12-10)+(11-10)(9-10)} 5 = \frac {-8} 5 = -1.6 \\
\gamma_3 = \frac {(13-10)(12-10)+(5-10)(9-10)} 5 = \frac {11} 5 = 2.2 \\
\gamma_4 = \frac {(13-10)(9-10)} 5 = \frac {-3} 5 = -0.6$
Note that $\gamma_0$ is just the population variance $\sigma^2$
Let's see if statsmodels gives us the same results! For this we'll create a fake dataset:
import pandas as pd
import numpy as np
%matplotlib inline
import statsmodels.api as sm
# Import the models we'll be using in this section
from statsmodels.tsa.stattools import acovf,acf,pacf,pacf_yw,pacf_ols
from pandas.plotting import lag_plot
import warnings
warnings.filterwarnings("ignore")
# Toy series from the worked example: n=5, mean=10, variance=8
df = pd.DataFrame({'a':[13, 5, 11, 12, 9]})
# Autocovariance at lags 0..4 (biased: every term divided by n)
arr = acovf(df['a'])
arr
Note that the number of terms in the calculations above are decreasing.
Statsmodels can return an "unbiased" autocovariance where instead of dividing by $n$ we divide by $n-k$.
$\gamma_0 = \frac {(13-10)(13-10)+(5-10)(5-10)+(11-10)(11-10)+(12-10)(12-10)+(9-10)(9-10)} {5-0} = \frac {40} 5 = 8.0 \\ \gamma_1 = \frac {(13-10)(5-10)+(5-10)(11-10)+(11-10)(12-10)+(12-10)(9-10)} {5-1} = \frac {-20} 4 = -5.0 \\ \gamma_2 = \frac {(13-10)(11-10)+(5-10)(12-10)+(11-10)(9-10)} {5-2} = \frac {-8} 3 = -2.67 \\ \gamma_3 = \frac {(13-10)(12-10)+(5-10)(9-10)} {5-3} = \frac {11} 2 = 5.5 \\ \gamma_4 = \frac {(13-10)(9-10)} {5-4} = \frac {-3} 1 = -3.0$
# Unbiased autocovariance: divide by n-k instead of n.
# NOTE(review): newer statsmodels versions renamed this argument to
# adjusted=True — confirm against the installed statsmodels version.
arr2 = acovf(df['a'],unbiased=True)
arr2
The correlation $\rho$ (rho) between two variables $y_1,y_2$ is given as:
where $E$ is the expectation operator, $\mu_{1},\sigma_{1}$ and $\mu_{2},\sigma_{2}$ are the means and standard deviations of $y_1$ and $y_2$.
When working with a single variable (i.e. autocorrelation) we would consider $y_1$ to be the original series and $y_2$ a lagged version of it. Note that with autocorrelation we work with $\bar y$, that is, the full population mean, and not the means of the reduced set of lagged factors (see note below).
Thus, the formula for $\rho_k$ for a time series at lag $k$ is:
${\displaystyle \rho_k = \frac {\sum\limits_{t=1}^{n-k} (y_t - \bar{y})(y_{t+k}-\bar{y})} {\sum\limits_{t=1}^{n} (y_t - \bar{y})^2}}$
This can be written in terms of the covariance constant $\gamma_k$ as:
${\displaystyle \rho_k = \frac {\gamma_k n} {\gamma_0 n} = \frac {\gamma_k} {\sigma^2}}$
For example,
$\rho_4 = \frac {\gamma_4} {\sigma^2} = \frac{-0.6} {8} = -0.075$
Note that ACF values are bound by -1 and 1. That is, ${\displaystyle -1 \leq \rho_k \leq 1}$
arr3 = acf(df['a'])
arr3
Partial autocorrelations measure the linear dependence of one variable after removing the effect of other variable(s) that affect both variables. That is, the partial autocorrelation at lag $k$ is the autocorrelation between $y_t$ and $y_{t+k}$ that is not accounted for by lags $1$ through $k−1$.
A common method employs the non-recursive Yule-Walker Equations:
$\phi_0 = 1\\ \phi_1 = \rho_1 = -0.50\\ \phi_2 = \frac {\rho_2 - {\rho_1}^2} {1-{\rho_1}^2} = \frac {(-0.20) - {(-0.50)}^2} {1-{(-0.50)}^2}= \frac {-0.45} {0.75} = -0.60$
As $k$ increases, we can solve for $\phi_k$ using matrix algebra and the Levinson–Durbin recursion algorithm which maps the sample autocorrelations $\rho$ to a Toeplitz diagonal-constant matrix. The full solution is beyond the scope of this course, but the setup is as follows:
$\displaystyle \begin{pmatrix}\rho_0&\rho_1&\cdots &\rho_{k-1}\\ \rho_1&\rho_0&\cdots &\rho_{k-2}\\ \vdots &\vdots &\ddots &\vdots \\ \rho_{k-1}&\rho_{k-2}&\cdots &\rho_0\\ \end{pmatrix}\quad \begin{pmatrix}\phi_{k1}\\\phi_{k2}\\\vdots\\\phi_{kk}\end{pmatrix} \mathbf = \begin{pmatrix}\rho_1\\\rho_2\\\vdots\\\rho_k\end{pmatrix}$
# it's 4 because we have 4 lags (n-1 for a series of length 5)
# mle stands for maximum likelihood estimation --> this uses the biased estimated coefficients
# yw stands for yule-walker equations
arr4 = pacf_yw(df['a'],nlags=4,method='mle')
arr4
# Same calculation with the unbiased (adjusted) coefficient estimates
arr5 = pacf_yw(df['a'],nlags=4,method='unbiased')
arr5
instead of YW you can use as well OLS
# Partial autocorrelation estimated via ordinary least squares regression
arr6 = pacf_ols(df['a'],nlags=4)
arr6
The arrays returned by .acf(df) and .pacf_yw(df) show the magnitude of the autocorrelation for a given $y$ at time $t$. Before we look at plotting arrays, let's look at the data itself for evidence of autocorrelation.
Pandas has a built-in plotting function that plots increasing $y_t$ values on the horizontal axis against lagged versions of the values $y_{t+1}$ on the vertical axis. If a dataset is non-stationary with an upward trend, then neighboring values should trend in the same way. Let's look at the Airline Passengers dataset first.
# Two contrasting datasets for the lag-plot comparison below
# Load a non-stationary dataset
df1 = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/airline_passengers.csv',index_col='Month',parse_dates=True)
df1.index.freq = 'MS'
# Load a stationary dataset
df2 = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/DailyTotalFemaleBirths.csv',index_col='Date',parse_dates=True)
df2.index.freq = 'D'
Let's look at a lag plot of the non-stationary data set (the airline)
lag_plot(df1['Thousands of Passengers']);
This shows evidence of very strong autocorrelation
Let's look at a lag plot of the stationary data set (the births)
lag_plot(df2['Births']);
There is no autocorrelation. The number of births of today is not correlated with the number of births from yesterday
Plotting the magnitude of the autocorrelations over the first few (20-40) lags can say a lot about a time series.
For example, consider the stationary Daily Total Female Births dataset:
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
# Now let's plot the autocorrelation at different lags
title = 'Autocorrelation: Airline Passengers'
lags = 40
# Shaded band on the plot = 95% confidence interval around zero correlation
plot_acf(df1,title=title,lags=lags);
There are clearly signs of seasonality in the data.
The shaded region is a 95% confidence interval, which means that the points (lags) outside the blue shaded area are likely to be significantly correlated with the current value, while the higher the lag, the less likely it is that there is significant correlation.
# ACF of the stationary births series, for comparison
title = 'Autocorrelation: Daily Female Births'
lags = 40
plot_acf(df2,title=title,lags=lags);
In a moving average model as we saw with Holt-Winters, we forecast the variable of interest using a linear combination of predictors. In our example we forecasted numbers of airline passengers in thousands based on a set of level, trend and seasonal predictors.
In an autoregression model, we forecast using a linear combination of past values of the variable. The term autoregression describes a regression of the variable against itself. An autoregression is run against a set of lagged values of order $p$.
where $c$ is a constant, $\phi_{1}$ and $\phi_{2}$ are lag coefficients up to order $p$, and $\varepsilon_{t}$ is white noise.
For example, an AR(1) model would follow the formula
$y_{t} = c + \phi_{1}y_{t-1} + \varepsilon_{t}$
whereas an AR(2) model would follow the formula
$y_{t} = c + \phi_{1}y_{t-1} + \phi_{2}y_{t-2} + \varepsilon_{t}$
and so on.
Note that the lag coeffients are usually less than one, as we usually restrict autoregressive models to stationary data.
Specifically, for an AR(1) model: $-1 \lt \phi_1 \lt 1$
and for an AR(2) model: $-1 \lt \phi_2 \lt 1, \ \phi_1 + \phi_2 \lt 1, \ \phi_2 - \phi_1 \lt 1$
Models AR(3) and higher become mathematically very complex. Fortunately statsmodels does all the heavy lifting for us.
For this exercise we'll look at monthly U.S. population estimates in thousands from January 2011 to December 2018 (96 records, 8 years of data). Population includes resident population plus armed forces overseas. The monthly estimate is the average of estimates for the first of the month and the first of the following month. Source: https://fred.stlouisfed.org/series/POPTHM
# Load specific forecasting tools
from statsmodels.tsa.ar_model import AR,ARResults
# Load the U.S. Population dataset (monthly estimates, thousands)
df = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/uspopulation.csv',index_col='DATE',parse_dates=True)
df.index.freq = 'MS'
# Plot labels
title='U.S. Monthly Population Estimates'
ylabel='Pop. Est. (thousands)'
xlabel='' # we don't really need a label here
ax = df['PopEst'].plot(figsize=(12,5),title=title);
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel);
The goal in this section is to:
As a general rule you should set the length of your test set equal to your intended forecast size. That is, for a monthly dataset you might want to forecast out one more year. Therefore your test set should be one year long.
Using the previous month to predict is not perfect, but still we get an acceptable result
# Set one year for testing: 84 train + 12 test of the 96 monthly records
train = df.iloc[:84]
test = df.iloc[84:]
# AR(1): regress the series on its own previous value only
model = AR(train['PopEst'])
AR1fit = model.fit(maxlag=1)
print(f'Lag: {AR1fit.k_ar}')
print(f'Coefficients:\n{AR1fit.params}')
# This is the general format for obtaining predictions
start=len(train)
end=len(train)+len(test)-1
predictions1 = AR1fit.predict(start=start, end=end, dynamic=False).rename('AR(1) Predictions')
test['PopEst'].plot(legend=True)
predictions1.plot(legend=True,figsize=(12,6));
# Recall that our model was already created above based on the training set
# AR(2): use the previous two values as predictors
model2 = AR(train['PopEst'])
AR2fit = model2.fit(maxlag=2)
print(f'Lag: {AR2fit.k_ar}')
print(f'Coefficients:\n{AR2fit.params}')
start=len(train)
end=len(train)+len(test)-1
predictions2 = AR2fit.predict(start=start, end=end, dynamic=False).rename('AR(2) Predictions')
# Overlay the test data with the AR(1) and AR(2) predictions
test['PopEst'].plot(legend=True)
predictions1.plot(legend=True)
predictions2.plot(legend=True,figsize=(12,6));
How do we actually find the best number of lags? We can optimize for "p" (number of lags)
If we leave maxlag empty, statsmodels tries to find the optimal value.
# With no maxlag argument, statsmodels chooses the lag order p itself
modelp = AR(train['PopEst'])
ARfit = modelp.fit()
print(f'Lag: {ARfit.k_ar}')
print(f'Coefficients:\n{ARfit.params}')
predictionsp = ARfit.predict(start=start, end=end, dynamic=False).rename('AR(p) Predictions')
test['PopEst'].plot(legend=True)
predictions1.plot(legend=True)
predictions2.plot(legend=True)
predictionsp.plot(legend=True,figsize=(12,6));
But we can see that it actually has similar results as AR(2)
What we can do is to change the criterion that statsmodels uses to determine p. We will use the t-stat criterion
# Re-fit letting the t-stat information criterion choose the lag order
modelpp = AR(train['PopEst'])
ARfit = modelpp.fit(ic='t-stat')
print(f'Lag: {ARfit.k_ar}')
print(f'Coefficients:\n{ARfit.params}')
predictionspp = ARfit.predict(start=start, end=end, dynamic=False).rename('AR(pp) Predictions')
test['PopEst'].plot(legend=True)
predictionsp.plot(legend=True)
predictionspp.plot(legend=True,figsize=(12,6));
That looks much better. But let's look at the mean squared error to see which one has the lowest error.
# Compare the four models' mean squared errors on the test set
preds = [predictions1, predictions2, predictionsp, predictionspp]
labels = ['AR(1)','AR(2)','AR(p)', 'AR(pp)']
# zip pairs each label with its prediction series (clearer than indexing by range)
for label, pred in zip(labels, preds):
    error = mean_squared_error(test['PopEst'], pred)
    print(f'{label} MSE was: {error}')
The last model, which used the t-stat as criterion is the one with the lower MSE
Now we're ready to train our best model on the greatest amount of data, and fit it to future dates. Using the last model
# First, retrain the model on the full dataset
model = AR(df['PopEst'])
# Next, fit the model using the t-stat criterion (the best performer above)
ARfit = model.fit(ic='t-stat')
# Forecast beyond the end of the data (start/end indices are inclusive)
fcast = ARfit.predict(start=len(df), end=len(df)+12, dynamic=False).rename('Forecast')
# Plot the results
df['PopEst'].plot(legend=True)
fcast.plot(legend=True,figsize=(12,6));
In upcoming sections we'll talk about different forecasting models like ARMA, ARIMA, Seasonal ARIMA and others. Each model addresses a different type of time series. For this reason, in order to select an appropriate model we need to know something about the data.
In this section we'll learn how to determine if a time series is stationary, if it's independent, and if two series demonstrate correlation and/or causality.
# Ignore harmless warnings
import warnings
warnings.filterwarnings("ignore")
# Reload the two example datasets used throughout this section
# Load a seasonal dataset
df1 = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/airline_passengers.csv',index_col='Month',parse_dates=True)
df1.index.freq = 'MS'
# Load a nonseasonal dataset
df2 = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/DailyTotalFemaleBirths.csv',index_col='Date',parse_dates=True)
df2.index.freq = 'D'
from statsmodels.tsa.stattools import ccovf,ccf,periodogram
from statsmodels.tsa.stattools import adfuller,kpss,coint,bds,q_stat,grangercausalitytests,levinson_durbin
from statsmodels.tools.eval_measures import mse, rmse, meanabs
A time series is stationary if the mean and variance are fixed between any two equidistant points. That is, no matter where you take your observations, the results should be the same. A times series that shows seasonality is not stationary.
A test for stationarity usually involves a unit root hypothesis test, where the null hypothesis $H_0$ is that the series is nonstationary, and contains a unit root. The alternate hypothesis $H_1$ supports stationarity. The augmented Dickey-Fuller Test is one such test.
To determine whether a series is stationary we can use the augmented Dickey-Fuller Test. In this test the null hypothesis states that $\phi = 1$ (this is also called a unit test). The test returns several statistics we'll see in a moment. Our focus is on the p-value. A small p-value ($p<0.05$) indicates strong evidence against the null hypothesis.
To demonstrate, we'll use a dataset we know is not stationary, the airline_passenger dataset. First, let's plot the data along with a 12-month rolling mean and standard deviation:
# 12-month rolling mean and std: both clearly change over time -> non-stationary
df1['12-month-SMA'] = df1['Thousands of Passengers'].rolling(window=12).mean()
df1['12-month-Std'] = df1['Thousands of Passengers'].rolling(window=12).std()
df1[['Thousands of Passengers','12-month-SMA','12-month-Std']].plot();
Not only is this dataset seasonal with a clear upward trend, the standard deviation increases over time as well.
print('Augmented Dickey-Fuller Test on Airline Data')
# Note that autolag = AIC, Akaike Information Criteria method
dftest = adfuller(df1['Thousands of Passengers'],autolag='AIC')
# Raw tuple output: test statistic, p-value, lags used, n obs, critical values, ...
dftest
To understand what those values mean, you can also check more information with the help() function
help(adfuller)
Since this is a bit annoying, we can customize it for our own use
print('Augmented Dickey-Fuller Test on Airline Data')
# Label the first four return values for readability
dfout = pd.Series(dftest[0:4],index=['ADF test statistic','p-value','# lags used','# observations'])
# Append the critical-values dict as extra labelled rows
for key,val in dftest[4].items():
    dfout[f'critical value ({key})']=val
print(dfout)
Here we have a very high p-value at 0.99, which provides weak evidence against the null hypothesis, and so we fail to reject the null hypothesis, and decide that our dataset is not stationary.
Note: in statistics we don't "accept" a null hypothesis - nothing is ever truly proven - we just fail to reject it.
Now let's apply the ADF test to stationary data with the Daily Total Female Births dataset.
# 30-day rolling statistics for the births data: roughly flat -> stationary
df2['30-Day-SMA'] = df2['Births'].rolling(window=30).mean()
df2['30-Day-Std'] = df2['Births'].rolling(window=30).std()
df2[['Births','30-Day-SMA','30-Day-Std']].plot();
print('Augmented Dickey-Fuller Test on Daily Female Births')
dftest = adfuller(df2['Births'],autolag='AIC')
# Same labelled-report construction as for the airline data above
dfout = pd.Series(dftest[0:4],index=['ADF test statistic','p-value','# lags used','# observations'])
for key,val in dftest[4].items():
    dfout[f'critical value ({key})']=val
print(dfout)
In this case our p-value is very low at 0.000052, and we do reject the null hypothesis. This dataset appears to have no unit root, and is stationary.
Since we'll use it frequently in the upcoming forecasts, let's define a function we can copy into future notebooks for running the augmented Dickey-Fuller test. Remember that we'll still have to import adfuller at the top of our notebook.
from statsmodels.tsa.stattools import adfuller
def adf_test(series,title=''):
    """
    Run the augmented Dickey-Fuller test on a time series and print a report.

    The series has NaNs dropped first, so differenced data can be passed in
    directly. The null hypothesis is that the series contains a unit root
    (i.e. is non-stationary); a p-value <= 0.05 rejects it.
    """
    print(f'Augmented Dickey-Fuller Test: {title}')
    result = adfuller(series.dropna(),autolag='AIC')
    stat_labels = ['ADF test statistic','p-value','# lags used','# observations']
    report = pd.Series(result[:4],index=stat_labels)
    # Append each critical value as its own labelled row
    for crit_key,crit_val in result[4].items():
        report[f'critical value ({crit_key})']=crit_val
    print(report.to_string()) # .to_string() suppresses the trailing "dtype: float64" line
    p_value = result[1]
    if p_value > 0.05:
        print("Weak evidence against the null hypothesis")
        print("Fail to reject the null hypothesis")
        print("Data has a unit root and is non-stationary")
    else:
        print("Strong evidence against the null hypothesis")
        print("Reject the null hypothesis")
        print("Data has no unit root and is stationary")
Let's run this function:
# Non-stationary airline data: expect "fail to reject the null hypothesis"
adf_test(df1['Thousands of Passengers'])
# Stationary births data: expect "reject the null hypothesis"
adf_test(df2['Births'])
The Granger Causality test tells us whether one series can be considered useful for forecasting another series. It tests this with various statistics such as the F-test and the chi-squared test.
# Load the sample series; columns 'a' and 'c' are used for the causality test
df3 = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/samples.csv',index_col=0, parse_dates=True)
df3.index.freq = 'MS'
df3.head()
# Test whether the second column helps forecast the first, for up to 3 lags
grangercausalitytests(df3[['a','c']],maxlag=3)
We need to interpret the p-values, and since all of them are >0.05, we can say there is no Granger causality between the time series.
Two calculations related to linear regression are mean squared error (MSE) and root mean squared error (RMSE)
The formula for the mean squared error is
$MSE = {\frac 1 L} \sum\limits_{l=1}^L (y_{T+l} - \hat y_{T+l})^2$
where $T$ is the last observation period and $l$ is the lag point up to $L$ number of test observations.
The formula for the root mean squared error is
$RMSE = \sqrt{MSE} = \sqrt{{\frac 1 L} \sum\limits_{l=1}^L (y_{T+l} - \hat y_{T+l})^2}$
The advantage of the RMSE is that it is expressed in the same units as the data.
A method similar to the RMSE is the mean absolute error (MAE) which is the mean of the magnitudes of the error, given as
$MAE = {\frac 1 L} \sum\limits_{l=1}^L \mid{y_{T+l}} - \hat y_{T+l}\mid$
A forecast method that minimizes the MAE will lead to forecasts of the median, while minimizing the RMSE will lead to forecasts of the mean.
let's create some random data
# Reproducible fake data: 50 rows of random integers in [20,30)
np.random.seed(42)
df = pd.DataFrame(np.random.randint(20,30,(50,2)),columns=['test','predictions'])
df.plot(figsize=(12,4));
df.head()
from statsmodels.tools.eval_measures import mse,rmse,meanabs
Let's use the most used metric for evaluating prediction accuracy
rmse(df['test'],df['predictions'])
# Reload the airline data with a monthly frequency for the seasonal plots
df = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/airline_passengers.csv',index_col='Month',parse_dates=True)
df.index.freq = 'MS'
let's take the data set and check monthly data
from statsmodels.graphics.tsaplots import month_plot, quarter_plot
# Seasonal subseries plot: one panel per calendar month, with its mean line
month_plot(df['Thousands of Passengers']);
This displays the range of values and their means (from the whole data series).
To check the quarter data, we will use:
dfq = df["Thousands of Passengers"].resample(rule='Q').mean()
Before we can apply an ARIMA forecasting model, we need to review the components of one.
ARIMA, or Autoregressive Integrated Moving Average, is actually a combination of 3 models:
# Reload both datasets for the auto_arima section
# Load a non-stationary dataset
df1 = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/airline_passengers.csv',index_col='Month',parse_dates=True)
df1.index.freq = 'MS'
# Load a stationary dataset
df2 = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/DailyTotalFemaleBirths.csv',index_col='Date',parse_dates=True)
df2.index.freq = 'D'
from pmdarima import auto_arima
# Ignore harmless warnings
import warnings
warnings.filterwarnings("ignore")
help(auto_arima)
auto_arima, based on the AIC, will give you the best combination of p, d, q parameters.
auto_arima(df2['Births'])
We specify we want the model to try from 0 to 6 p and from 0 to 3
stepwise_fit = auto_arima(df2['Births'], start_p=0, start_q=0, max_p=6, max_q=3, seasonal=False, trace=True)
For our data, the best p,d,q parameters found are 1,1,1. Note that auto_arima did not try all of the parameter combinations, since it detected that the AIC was not improving, so it stopped trying further combinations.
This shows a recommended (p,d,q) ARIMA Order of (1,1,1), with no seasonal_order component.
We can see how this was determined by looking at the stepwise results. The recommended order is the one with the lowest Akaike information criterion or AIC score. Note that the recommended model may not be the one with the closest fit. The AIC score takes complexity into account, and tries to identify the best forecasting model.
Let's have a look now at the description of the winner parameter-combination
stepwise_fit.summary()
Now let's look at the non-stationary, seasonal Airline Passengers dataset. Note that since we set seasonal=True, the model is also running a SARIMA model.
# With trace=True we can see the results for each parameter combination tried
# m is the seasonal period: m=4 for quarterly data, m=12 for monthly, m=1 for non-seasonal
stepwise_fit = auto_arima(df1['Thousands of Passengers'], start_p=1, start_q=1,
                          max_p=3, max_q=3, m=12,
                          start_P=0, seasonal=True,
                          d=None, D=1, trace=True,
                          error_action='ignore', # we don't want to know if an order does not work
                          suppress_warnings=True, # we don't want convergence warnings
                          stepwise=True) # set to stepwise
stepwise_fit.summary()
The winner is a SARIMA (0,1,1)x(2,1,12) (p,d,q)x(P,D,Q,M). We will talk about SARIMA later
In the table you can see the coefficients for each of the AR & MA lags
This section covers Autoregressive Moving Averages (ARMA) and Autoregressive Integrated Moving Averages (ARIMA).
Recall that an AR(1) model follows the formula
$y_{t} = c + \phi_{1}y_{t-1} + \varepsilon_{t}$
while an MA(1) model follows the formula
$y_{t} = \mu + \theta_{1}\varepsilon_{t-1} + \varepsilon_{t}$
where $c$ is a constant, $\mu$ is the expectation of $y_{t}$ (often assumed to be zero), $\phi_1$ (phi-sub-one) is the AR lag coefficient, $\theta_1$ (theta-sub-one) is the MA lag coefficient, and $\varepsilon$ (epsilon) is white noise.
An ARMA(1,1) model therefore follows
$y_{t} = c + \phi_{1}y_{t-1} + \theta_{1}\varepsilon_{t-1} + \varepsilon_{t}$
ARMA models can be used on stationary datasets.
For non-stationary datasets with a trend component, ARIMA models apply a differencing coefficient as well.
In this first section we'll look at a stationary dataset, determine (p,q) orders, and run a forecasting ARMA model fit to the data. In practice it's rare to find stationary data with no trend or seasonal component, but the first four months of the Daily Total Female Births dataset should work for our purposes.
# Load specific forecasting tools
# NOTE(review): statsmodels.tsa.arima_model is deprecated and was removed in statsmodels >= 0.13;
# newer code should use statsmodels.tsa.arima.model.ARIMA instead — confirm the installed version
from statsmodels.tsa.arima_model import ARMA,ARMAResults,ARIMA,ARIMAResults
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf # for determining (p,q) orders
from pmdarima import auto_arima # for determining ARIMA orders
# Ignore harmless warnings
import warnings
warnings.filterwarnings("ignore")
# Load datasets
df1 = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/DailyTotalFemaleBirths.csv',index_col='Date',parse_dates=True)
df1.index.freq = 'D' # daily frequency
df1 = df1[:120] # we only want the first four months
df2 = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/TradeInventories.csv',index_col='Date',parse_dates=True)
df2.index.freq='MS' # month-start frequency
# this function was created previously (control f adf_test)
adf_test(df1["Births"])
This tool should give just $p$ and $q$ value recommendations for this dataset.
# Non-seasonal search: recommends (p,d,q) orders for the Births series
auto_arima(df1["Births"],seasonal=False).summary()
As a general rule you should set the length of your test set equal to your intended forecast size. For this dataset we'll attempt a 1-month forecast.
# Hold out one month (30 of the 120 days) for testing
train = df1.iloc[:90]
test = df1.iloc[90:]
If you want you can run help(ARMA) to learn what incoming arguments are available/expected, and what's being returned.
# Fit an ARMA(2,2) model: AR order p=2, MA order q=2
model = ARMA(train['Births'],order=(2,2))
results = model.fit()
results.summary()
# Predict over the exact index positions of the test set
start=len(train)
end=len(train)+len(test)-1
predictions = results.predict(start=start, end=end).rename('ARMA(2,2) Predictions')
# Plot the test data against the predictions
title = 'Daily Total Female Births'
ylabel='Births'
xlabel='' # we don't really need a label here
ax = test['Births'].plot(legend=True,figsize=(12,6),title=title)
predictions.plot(legend=True)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel);
The orange line shows the predicted values. Even though it looks like a very bad prediction, it makes complete sense, since ARMA predicts the average value. The model is not able to predict the noise, but it is able to predict the average value.
In fact if we calculate the average value of test and predictions, they have the same average value:
# The test set and the predictions share (approximately) the same mean
test.mean()
predictions.mean()
Now let's work on the ARIMA model, so we will add the I (integrated) component to the ARMA model:
# HERE'S A TRICK TO ADD COMMAS TO Y-AXIS TICK VALUES
import matplotlib.ticker as ticker
formatter = ticker.StrMethodFormatter('{x:,.0f}') # e.g. 1234567 -> "1,234,567"
title = 'Real Manufacturing and Trade Inventories'
ylabel='Chained 2012 Dollars'
xlabel='' # we don't really need a label here
ax = df2['Inventories'].plot(figsize=(12,5),title=title)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel)
ax.yaxis.set_major_formatter(formatter);
We probably won't learn a lot from it, but it never hurts to run an ETS Decomposition plot.
from statsmodels.tsa.seasonal import seasonal_decompose
# ETS decomposition of the inventories series (trend / seasonal / residual)
result = seasonal_decompose(df2['Inventories'], model='additive') # model='add' also works
result.plot();
Here we see that the seasonal component does not contribute significantly to the behavior of the series.
So for the purpose of this code, we will ignore the seasonal component (otherwise we should use SARIMA, instead of ARIMA)
# Non-seasonal search: recommends (p,d,q) orders for the Inventories series
auto_arima(df2['Inventories'],seasonal=False).summary()
This suggests that we should fit a SARIMAX(0,1,0) model to best forecast future values of the series. Before we train the model, let's look at the augmented Dickey-Fuller Test, and the ACF/PACF plots to see if they agree. These steps are optional, and we would likely skip them in practice.
from statsmodels.tsa.statespace.tools import diff
# First difference of the series (k_diff=1)
df2['d1'] = diff(df2['Inventories'],k_diff=1)
# Equivalent to:
# df1['d1'] = df1['Inventories'] - df1['Inventories'].shift(1)
adf_test(df2['d1'],'Real Manufacturing and Trade Inventories')
This confirms that we reached stationarity after the first difference.
A PACF Plot can reveal recommended AR(p) orders, and an ACF Plot can do the same for MA(q) orders.
Alternatively, we can compare the stepwise Akaike Information Criterion (AIC) values across a set of different (p,q) combinations to choose the best combination.
# ACF plot — helps choose the MA(q) order
title = 'Autocorrelation: Real Manufacturing and Trade Inventories'
lags = 40
plot_acf(df2['Inventories'],title=title,lags=lags);
# PACF plot — helps choose the AR(p) order
title = 'Partial Autocorrelation: Real Manufacturing and Trade Inventories'
lags = 40
plot_pacf(df2['Inventories'],title=title,lags=lags);
This tells us that the AR component should be more important than MA. From the Duke University Statistical Forecasting site:
If the PACF displays a sharp cutoff while the ACF decays more slowly (i.e., has significant spikes at higher lags), we say that the stationarized series displays an "AR signature," meaning that the autocorrelation pattern can be explained more easily by adding AR terms than by adding MA terms.
Let's take a look at pmdarima.auto_arima done stepwise to see if having $p$ and $q$ terms the same still makes sense:
# Stepwise search over non-seasonal orders only (seasonal=False; m is ignored in that case)
stepwise_fit = auto_arima(df2['Inventories'], start_p=0, start_q=0,
                          max_p=2, max_q=2, m=12,
                          seasonal=False,
                          d=None, trace=True,
                          error_action='ignore', # we don't want to know if an order does not work
                          suppress_warnings=True, # we don't want convergence warnings
                          stepwise=True) # set to stepwise
stepwise_fit.summary()
len(df2)
# Hold out one year (12 monthly observations) for testing
train = df2.iloc[:252]
test = df2.iloc[252:]
model = ARIMA(train['Inventories'],order=(1,1,1))
results = model.fit()
results.summary()
# Obtain predicted values over the test-set index positions
start=len(train)
end=len(train)+len(test)-1
predictions = results.predict(start=start, end=end, dynamic=False, typ='levels').rename('ARIMA(1,1,1) Predictions')
Passing dynamic=False means that forecasts at each point are generated using the full history up to that point (all lagged values).
Passing typ='levels' predicts the levels of the original endogenous variables. If we'd used the default typ='linear' we would have seen linear predictions in terms of the differenced endogenous variables.
For more information on these arguments visit https://www.statsmodels.org/stable/generated/statsmodels.tsa.arima_model.ARIMAResults.predict.html
# Compare each prediction with the observed test value, pair by pair
for pred, actual in zip(predictions, test['Inventories']):
    print(f"predicted={pred:<11.10}, expected={actual}")
# Plot predictions against known values
title = 'Real Manufacturing and Trade Inventories'
ylabel='Chained 2012 Dollars'
xlabel='' # we don't really need a label here
ax = test['Inventories'].plot(legend=True,figsize=(12,6),title=title)
predictions.plot(legend=True)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel)
ax.yaxis.set_major_formatter(formatter); # comma-separated y-axis ticks (formatter defined earlier)
# Evaluate the fit with MSE and RMSE
from sklearn.metrics import mean_squared_error
error = mean_squared_error(test['Inventories'], predictions)
print(f'ARIMA(1,1,1) MSE Error: {error:11.10}')
from statsmodels.tools.eval_measures import rmse
error = rmse(test['Inventories'], predictions)
print(f'ARIMA(1,1,1) RMSE Error: {error:11.10}')
# Retrain on the FULL dataset, then forecast 12 months past the end of the data
model = ARIMA(df2['Inventories'],order=(1,1,1))
results = model.fit()
fcast = results.predict(len(df2),len(df2)+11,typ='levels').rename('ARIMA(1,1,1) Forecast')
# Plot the forecast alongside the historical values
title = 'Real Manufacturing and Trade Inventories'
ylabel='Chained 2012 Dollars'
xlabel='' # we don't really need a label here
ax = df2['Inventories'].plot(legend=True,figsize=(12,6),title=title)
fcast.plot(legend=True)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel)
ax.yaxis.set_major_formatter(formatter); # comma-separated y-axis ticks (formatter defined earlier)
We have finally reached one of the most fascinating aspects of time series analysis: seasonality.
Where ARIMA accepts the parameters $(p,d,q)$, SARIMA accepts an additional set of parameters $(P,D,Q)m$ that specifically describe the seasonal components of the model. Here $P$, $D$ and $Q$ represent the seasonal regression, differencing and moving average coefficients, and $m$ represents the number of data points (rows) in each seasonal cycle.
NOTE: The statsmodels implementation of SARIMA is called SARIMAX. The “X” added to the name means that the function also supports exogenous regressor variables. We'll cover these in the next section.
# Load specific forecasting tools
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf # for determining (p,q) orders
from statsmodels.tsa.seasonal import seasonal_decompose # for ETS Plots
from pmdarima import auto_arima # for determining ARIMA orders
# Ignore harmless warnings
import warnings
warnings.filterwarnings("ignore")
# Load dataset (monthly mean CO2 readings from Mauna Loa; year/month are integer columns)
df = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/co2_mm_mlo.csv')
df.head()
We need to combine two integer columns (year and month) into a DatetimeIndex. We can do this by passing a dictionary into pandas.to_datetime() with year, month and day values.
For more information visit https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html
# Add a "date" datetime column built from the integer year/month columns (day fixed to 1)
df['date']=pd.to_datetime(dict(year=df['year'], month=df['month'], day=1))
#another way to merge dates is pd.to_datetime({'year':df['year'],'month':df['month'],'day':1})
# Set "date" to be the index
df.set_index('date',inplace=True)
df.index.freq = 'MS' # month-start frequency
df.head()
df.info()
# Plot the interpolated CO2 series
title = 'Monthly Mean CO₂ Levels (ppm) over Mauna Loa, Hawaii'
ylabel='parts per million'
xlabel='' # we don't really need a label here
ax = df['interpolated'].plot(figsize=(12,6),title=title)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel);
# ETS decomposition (additive model)
result = seasonal_decompose(df['interpolated'], model='add')
result.plot();
Although small in scale compared to the overall values, there is a definite annual seasonality.
This may take awhile as there are a lot more combinations to evaluate.
# For SARIMA orders we set seasonal=True and pass in an m value
# since seasonality is yearly and we have monthly data, m=12
auto_arima(df['interpolated'],seasonal=True,m=12).summary()
# Hold out one year (12 monthly observations) for testing
train = df.iloc[:717]
test = df.iloc[717:]
model = SARIMAX(train['interpolated'],order=(0,1,3),seasonal_order=(1,0,1,12))
results = model.fit()
results.summary()
# Obtain predicted values over the test-set index positions
start=len(train)
end=len(train)+len(test)-1
predictions = results.predict(start=start, end=end, dynamic=False, typ='levels').rename('SARIMA(0,1,3)(1,0,1,12) Predictions')
Passing dynamic=False means that forecasts at each point are generated using the full history up to that point (all lagged values).
Passing typ='levels' predicts the levels of the original endogenous variables. If we'd used the default typ='linear' we would have seen linear predictions in terms of the differenced endogenous variables.
For more information on these arguments visit https://www.statsmodels.org/stable/generated/statsmodels.tsa.arima_model.ARIMAResults.predict.html
# Compare each prediction with the observed test value, pair by pair
for pred, actual in zip(predictions, test['interpolated']):
    print(f"predicted={pred:<11.10}, expected={actual}")
# Plot predictions against known values
title = 'Monthly Mean CO₂ Levels (ppm) over Mauna Loa, Hawaii'
ylabel='parts per million'
xlabel=''
ax = test['interpolated'].plot(legend=True,figsize=(12,6),title=title)
predictions.plot(legend=True)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel);
# Evaluate the fit with MSE and RMSE
from sklearn.metrics import mean_squared_error
error = mean_squared_error(test['interpolated'], predictions)
print(f'SARIMA(0,1,3)(1,0,1,12) MSE Error: {error:11.10}')
from statsmodels.tools.eval_measures import rmse
error = rmse(test['interpolated'], predictions)
print(f'SARIMA(0,1,3)(1,0,1,12) RMSE Error: {error:11.10}')
Remember that in order to understand and interpret the error you always need to look at the data.
# Mean of the test set — the scale against which to judge the RMSE
test['interpolated'].mean()
These are outstanding results! An RMSE of 0.35 is tiny compared to a mean of 408.333.
# Retrain on the FULL dataset, then forecast 12 months past the end of the data
model = SARIMAX(df['interpolated'],order=(0,1,3),seasonal_order=(1,0,1,12))
results = model.fit()
fcast = results.predict(len(df),len(df)+11,typ='levels').rename('SARIMA(0,1,3)(1,0,1,12) Forecast')
# Plot the forecast alongside the historical values
title = 'Monthly Mean CO₂ Levels (ppm) over Mauna Loa, Hawaii'
ylabel='parts per million'
xlabel=''
ax = df['interpolated'].plot(legend=True,figsize=(12,6),title=title)
fcast.plot(legend=True)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel);
So far the models we've looked at consider past values of a dataset and past errors to determine future trends, seasonality and forecasted values. We look now to models that encompass these non-seasonal (p,d,q) and seasonal (P,D,Q,m) factors, but introduce the idea that external factors (environmental, economic, etc.) can also influence a time series, and be used in forecasting.
import pandas as pd
import numpy as np
%matplotlib inline
# Load specific forecasting tools
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf # for determining (p,q) orders
from statsmodels.tsa.seasonal import seasonal_decompose # for ETS Plots
from pmdarima import auto_arima # for determining ARIMA orders
# Ignore harmless warnings
import warnings
warnings.filterwarnings("ignore")
# Load dataset (daily restaurant visitors with a binary "holiday" exogenous column)
df = pd.read_csv('./original/TSA_COURSE_NOTEBOOKS/Data/RestaurantVisitors.csv',index_col='date',parse_dates=True)
df.index.freq = 'D' # daily frequency
For this section we've built a Restaurant Visitors dataset that was inspired by a recent Kaggle competition. The data considers daily visitors to four restaurants located in the United States, subject to American holidays. For the exogenous variable we'll see how holidays affect patronage. The dataset contains 478 days of restaurant data, plus an additional 39 days of holiday data for forecasting purposes.
Notice that even though the restaurant visitor columns contain integer data, they appear as floats. This is because the bottom of the dataframe has 39 rows of NaN data to accommodate the extra holiday data we'll use for forecasting, and pandas won't allow NaN's as integers. We could leave it like this, but since we have to drop NaN values anyway, let's also convert the columns to dtype int64.
# Drop the 39 trailing NaN rows (held back for forecasting). The .copy() gives us an
# independent frame, so the astype assignments below modify df1 itself instead of a
# view of df (which would raise SettingWithCopyWarning and, under pandas copy-on-write,
# silently do nothing).
df1 = df.dropna().copy()
df1.tail()
# Change the dtype of selected columns from float to int (we cannot risk having 1.5 customers)
cols = ['rest1','rest2','rest3','rest4','total']
for col in cols:
    df1[col] = df1[col].astype(int)
df1.head()
# Plot the daily total visitors
title='Restaurant Visitors'
ylabel='Visitors per day'
xlabel='' # we don't really need a label here
ax = df1['total'].plot(figsize=(16,5),title=title)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel);
Rather than prepare a separate plot, we can use matplotlib to shade holidays behind our restaurant data.
# Re-plot with holidays shaded behind the restaurant data
title='Restaurant Visitors'
ylabel='Visitors per day'
xlabel='' # we don't really need a label here
ax = df1['total'].plot(figsize=(16,5),title=title)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel)
for x in df1.query('holiday==1').index: # for days where holiday == 1 #you can also do it the classical way: df1[df1['holiday']==1].index
    ax.axvline(x=x, color='k', alpha = 0.3); # add a semi-transparent grey line
# ETS decomposition of the total-visitors series
result = seasonal_decompose(df1['total'])
result.plot();
result.seasonal.plot(figsize=(18,5)); # zoom in on the seasonal component alone
from statsmodels.tsa.stattools import adfuller

def adf_test(series,title=''):
    """
    Pass in a time series and an optional title, returns an ADF report
    """
    print(f'Augmented Dickey-Fuller Test: {title}')
    adf_out = adfuller(series.dropna(),autolag='AIC') # .dropna() handles differenced data
    # Unpack the first four entries of the result tuple by name
    stat, pvalue, lags_used, n_obs = adf_out[:4]
    report = pd.Series(
        [stat, pvalue, lags_used, n_obs],
        index=['ADF test statistic','p-value','# lags used','# observations'],
    )
    # Append the critical values at each significance level
    for level, crit in adf_out[4].items():
        report[f'critical value ({level})'] = crit
    print(report.to_string()) # .to_string() removes the line "dtype: float64"
    # p <= 0.05 -> reject the unit-root null hypothesis -> stationary
    if pvalue <= 0.05:
        print("Strong evidence against the null hypothesis")
        print("Reject the null hypothesis")
        print("Data has no unit root and is stationary")
    else:
        print("Weak evidence against the null hypothesis")
        print("Fail to reject the null hypothesis")
        print("Data has a unit root and is non-stationary")

adf_test(df1['total'])
This may take awhile as there are a lot of combinations to evaluate.
# For SARIMA orders we set seasonal=True and pass in an m value
# weekly seasonality on daily data, so m=7
auto_arima(df1['total'],seasonal=True,m=7).summary()
Excellent! This provides an ARIMA order of (1,0,0) and a seasonal order of (2,0,0,7). Now let's train & test the SARIMA model, evaluate it, then compare the result to a model that uses an exogenous variable.
We'll assign 42 days (6 weeks) to the test set so that it includes several holidays.
# Set six weeks (42 days) for testing, so the test span includes several holidays
train = df1.iloc[:436]
test = df1.iloc[436:]
NOTE: To avoid a ValueError: non-invertible starting MA parameters found we're going to set enforce_invertibility to False.
model = SARIMAX(train['total'],order=(1,0,0),seasonal_order=(2,0,0,7),enforce_invertibility=False)
# Why relax invertibility? An invertible process means recent lags carry more weight than
# older lags — i.e. the MA coefficients (theta) are < 1 in absolute value. Statsmodels treats
# this as the "normal" case and by default forces the coefficients to satisfy it. Setting
# enforce_invertibility=False lifts that constraint so coefficients >= 1 are allowed, which
# also avoids the "non-invertible starting MA parameters found" ValueError.
results = model.fit()
results.summary()
# Obtain predicted values over the test-set index positions
start=len(train)
end=len(train)+len(test)-1
predictions = results.predict(start=start, end=end, dynamic=False).rename('SARIMA(1,0,0)(2,0,0,7) Predictions')
Passing dynamic=False means that forecasts at each point are generated using the full history up to that point (all lagged values).
For more information on these arguments visit https://www.statsmodels.org/stable/generated/statsmodels.tsa.arima_model.ARIMAResults.predict.html
# Plot predictions against known values, shading holidays behind the data
title='Restaurant Visitors'
ylabel='Visitors per day'
xlabel=''
ax = test['total'].plot(legend=True,figsize=(12,6),title=title)
predictions.plot(legend=True)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel)
for x in test.query('holiday==1').index: # for days where holiday == 1
    ax.axvline(x=x, color='k', alpha = 0.3); # add a semi-transparent grey line
from sklearn.metrics import mean_squared_error
from statsmodels.tools.eval_measures import rmse
# error1/error2 are kept around for comparison against the exogenous (SARIMAX) model below
error1 = mean_squared_error(test['total'], predictions)
error2 = rmse(test['total'], predictions)
print(f'SARIMA(1,0,0)(2,0,0,7) MSE Error: {error1:11.10}')
print(f'SARIMA(1,0,0)(2,0,0,7) RMSE Error: {error2:11.10}')
# Same model plus the "holiday" column as an exogenous regressor
model = SARIMAX(train['total'],exog=train['holiday'],order=(1,0,0),seasonal_order=(2,0,0,7),enforce_invertibility=False)
results = model.fit()
results.summary()
# Obtain predicted values
start=len(train)
end=len(train)+len(test)-1
exog_forecast = test[['holiday']] # requires two brackets to yield a 2-D (42,1) frame rather than a 1-D Series
predictions = results.predict(start=start, end=end, exog=exog_forecast).rename('SARIMAX(1,0,0)(2,0,0,7) Predictions')
# Plot predictions against known values, shading holidays behind the data
title='Restaurant Visitors'
ylabel='Visitors per day'
xlabel=''
ax = test['total'].plot(legend=True,figsize=(12,6),title=title)
predictions.plot(legend=True)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel)
for x in test.query('holiday==1').index: # for days where holiday == 1
    ax.axvline(x=x, color='k', alpha = 0.3); # add a semi-transparent grey line
We can see that the exogenous variable (holidays) had a positive impact on the forecast by raising predicted values at 3/17, 4/14, 4/16 and 4/17! Let's compare evaluations:
# Print values from SARIMA above
print(f'SARIMA(1,0,0)(2,0,0,7) MSE Error: {error1:11.10}')
print(f'SARIMA(1,0,0)(2,0,0,7) RMSE Error: {error2:11.10}')
print()
# Evaluate the SARIMAX (exogenous) predictions with the same metrics
error1x = mean_squared_error(test['total'], predictions)
error2x = rmse(test['total'], predictions)
# Print new SARIMAX values
print(f'SARIMAX(1,0,0)(2,0,0,7) MSE Error: {error1x:11.10}')
print(f'SARIMAX(1,0,0)(2,0,0,7) RMSE Error: {error2x:11.10}')
Our RMSE is lower, which means adding the exogenous variable helps us to predict better the future
We're going to forecast 39 days into the future, and use the additional holiday data
# Retrain on all 478 observed days, with the holiday column as the exogenous variable
model = SARIMAX(df1['total'],exog=df1['holiday'],order=(1,0,0),seasonal_order=(2,0,0,7),enforce_invertibility=False)
results = model.fit()
# The 39 rows of df beyond the 478 in df1 hold the future holiday flags for the forecast
exog_forecast = df[478:][['holiday']]
fcast = results.predict(len(df1),len(df1)+38,exog=exog_forecast).rename('SARIMAX(1,0,0)(2,0,0,7) Forecast')
# Plot the forecast alongside historical values
title='Restaurant Visitors'
ylabel='Visitors per day'
xlabel=''
ax = df1['total'].plot(legend=True,figsize=(16,6),title=title)
fcast.plot(legend=True)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel)
for x in df.query('holiday==1').index: # for days where holiday == 1
    ax.axvline(x=x, color='k', alpha = 0.3); # shade holidays (semi-transparent grey lines)